0-Libraries and first observations
# usual data science stack in python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# imports of needed modules from sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,f1_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.dummy import DummyClassifier
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
# load main datasets
app_train, app_test = pd.read_csv("./data/application_train.csv"), pd.read_csv("./data/application_test.csv")
display(app_train.head(3))
display(app_test.head(3))
SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 rows × 122 columns
SK_ID_CURR | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100001 | Cash loans | F | N | Y | 0 | 135000.0 | 568800.0 | 20560.5 | 450000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 100005 | Cash loans | M | N | Y | 0 | 99000.0 | 222768.0 | 17370.0 | 180000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
2 | 100013 | Cash loans | M | Y | Y | 0 | 202500.0 | 663264.0 | 69777.0 | 630000.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 |
3 rows × 121 columns
app_train.shape, app_test.shape
((307511, 122), (48744, 121))
1-Data analysis
app_train.TARGET.value_counts()
0    282686
1     24825
Name: TARGET, dtype: int64
print("Pourcentage de clients en difficultés de payments:",app_train.TARGET.sum() / app_train.shape[0] * 100 ,"%")
Pourcentage de clients en difficultés de payments: 8.072881945686495 %
plt.title('Distribution of the "TARGET" column - 1 -> client with payment difficulties / 0 -> all other cases')
sns.countplot(x=app_train.TARGET, data=app_train)
plt.show()
app_train.info()
print("----")
app_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 307511 entries, 0 to 307510 Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR dtypes: float64(65), int64(41), object(16) memory usage: 286.2+ MB ---- <class 'pandas.core.frame.DataFrame'> RangeIndex: 48744 entries, 0 to 48743 Columns: 121 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR dtypes: float64(65), int64(40), object(16) memory usage: 45.0+ MB
display(app_train.describe())
print("<---->")
display(app_test.describe())
SK_ID_CURR | TARGET | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 307511.000000 | 307511.000000 | 307511.000000 | 3.075110e+05 | 3.075110e+05 | 307499.000000 | 3.072330e+05 | 307511.000000 | 307511.000000 | 307511.000000 | ... | 307511.000000 | 307511.000000 | 307511.000000 | 307511.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 | 265992.000000 |
mean | 278180.518577 | 0.080729 | 0.417052 | 1.687979e+05 | 5.990260e+05 | 27108.573909 | 5.383962e+05 | 0.020868 | -16036.995067 | 63815.045904 | ... | 0.008130 | 0.000595 | 0.000507 | 0.000335 | 0.006402 | 0.007000 | 0.034362 | 0.267395 | 0.265474 | 1.899974 |
std | 102790.175348 | 0.272419 | 0.722121 | 2.371231e+05 | 4.024908e+05 | 14493.737315 | 3.694465e+05 | 0.013831 | 4363.988632 | 141275.766519 | ... | 0.089798 | 0.024387 | 0.022518 | 0.018299 | 0.083849 | 0.110757 | 0.204685 | 0.916002 | 0.794056 | 1.869295 |
min | 100002.000000 | 0.000000 | 0.000000 | 2.565000e+04 | 4.500000e+04 | 1615.500000 | 4.050000e+04 | 0.000290 | -25229.000000 | -17912.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 189145.500000 | 0.000000 | 0.000000 | 1.125000e+05 | 2.700000e+05 | 16524.000000 | 2.385000e+05 | 0.010006 | -19682.000000 | -2760.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 278202.000000 | 0.000000 | 0.000000 | 1.471500e+05 | 5.135310e+05 | 24903.000000 | 4.500000e+05 | 0.018850 | -15750.000000 | -1213.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
75% | 367142.500000 | 0.000000 | 1.000000 | 2.025000e+05 | 8.086500e+05 | 34596.000000 | 6.795000e+05 | 0.028663 | -12413.000000 | -289.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 |
max | 456255.000000 | 1.000000 | 19.000000 | 1.170000e+08 | 4.050000e+06 | 258025.500000 | 4.050000e+06 | 0.072508 | -7489.000000 | 365243.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 9.000000 | 8.000000 | 27.000000 | 261.000000 | 25.000000 |
8 rows × 106 columns
<---->
SK_ID_CURR | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 48744.000000 | 48744.000000 | 4.874400e+04 | 4.874400e+04 | 48720.000000 | 4.874400e+04 | 48744.000000 | 48744.000000 | 48744.000000 | 48744.000000 | ... | 48744.000000 | 48744.0 | 48744.0 | 48744.0 | 42695.000000 | 42695.000000 | 42695.000000 | 42695.000000 | 42695.000000 | 42695.000000 |
mean | 277796.676350 | 0.397054 | 1.784318e+05 | 5.167404e+05 | 29426.240209 | 4.626188e+05 | 0.021226 | -16068.084605 | 67485.366322 | -4967.652716 | ... | 0.001559 | 0.0 | 0.0 | 0.0 | 0.002108 | 0.001803 | 0.002787 | 0.009299 | 0.546902 | 1.983769 |
std | 103169.547296 | 0.709047 | 1.015226e+05 | 3.653970e+05 | 16016.368315 | 3.367102e+05 | 0.014428 | 4325.900393 | 144348.507136 | 3552.612035 | ... | 0.039456 | 0.0 | 0.0 | 0.0 | 0.046373 | 0.046132 | 0.054037 | 0.110924 | 0.693305 | 1.838873 |
min | 100001.000000 | 0.000000 | 2.694150e+04 | 4.500000e+04 | 2295.000000 | 4.500000e+04 | 0.000253 | -25195.000000 | -17463.000000 | -23722.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 188557.750000 | 0.000000 | 1.125000e+05 | 2.606400e+05 | 17973.000000 | 2.250000e+05 | 0.010006 | -19637.000000 | -2910.000000 | -7459.250000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 277549.000000 | 0.000000 | 1.575000e+05 | 4.500000e+05 | 26199.000000 | 3.960000e+05 | 0.018850 | -15785.000000 | -1293.000000 | -4490.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 |
75% | 367555.500000 | 1.000000 | 2.250000e+05 | 6.750000e+05 | 37390.500000 | 6.300000e+05 | 0.028663 | -12496.000000 | -296.000000 | -1901.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 3.000000 |
max | 456250.000000 | 20.000000 | 4.410000e+06 | 2.245500e+06 | 180576.000000 | 2.245500e+06 | 0.072508 | -7338.000000 | 365243.000000 | 0.000000 | ... | 1.000000 | 0.0 | 0.0 | 0.0 | 2.000000 | 2.000000 | 2.000000 | 6.000000 | 7.000000 | 17.000000 |
8 rows × 105 columns
# Number of unique classes in each object column
app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)
NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64
# Function to calculate missing values by column // credits Will Koehrsen
def missing_values_table(df):
# Total missing values
mis_val = df.isnull().sum()
# Percentage of missing values
mis_val_percent = 100 * df.isnull().sum() / len(df)
# Make a table with the results
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
# Rename the columns
mis_val_table_ren_columns = mis_val_table.rename(
columns = {0 : 'Missing Values', 1 : '% of Total Values'})
# Sort the table by percentage of missing descending
mis_val_table_ren_columns = mis_val_table_ren_columns[
mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
'% of Total Values', ascending=False).round(1)
# Print some summary information
print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
"There are " + str(mis_val_table_ren_columns.shape[0]) +
" columns that have missing values.")
# Return the dataframe with missing information
return mis_val_table_ren_columns
# Missing values statistics
missing_values = missing_values_table(app_train)
missing_values.head(12)
Your selected dataframe has 122 columns. There are 67 columns that have missing values.
Missing Values | % of Total Values | |
---|---|---|
COMMONAREA_MEDI | 214865 | 69.9 |
COMMONAREA_AVG | 214865 | 69.9 |
COMMONAREA_MODE | 214865 | 69.9 |
NONLIVINGAPARTMENTS_MEDI | 213514 | 69.4 |
NONLIVINGAPARTMENTS_MODE | 213514 | 69.4 |
NONLIVINGAPARTMENTS_AVG | 213514 | 69.4 |
FONDKAPREMONT_MODE | 210295 | 68.4 |
LIVINGAPARTMENTS_MODE | 210199 | 68.4 |
LIVINGAPARTMENTS_MEDI | 210199 | 68.4 |
LIVINGAPARTMENTS_AVG | 210199 | 68.4 |
FLOORSMIN_MODE | 208642 | 67.8 |
FLOORSMIN_MEDI | 208642 | 67.8 |
# cols_to_drop = list((app_train.isnull().sum() > 75000).index)
cols_to_drop = [c for c in app_train.columns if app_train[c].isnull().sum() > 75000]
app_train, app_test = app_train.drop(cols_to_drop, axis=1), app_test.drop(cols_to_drop, axis=1)
app_test.isnull().sum().sort_values(ascending=False).head(10)
EXT_SOURCE_3                  8668
AMT_REQ_CREDIT_BUREAU_YEAR    6049
AMT_REQ_CREDIT_BUREAU_QRT     6049
AMT_REQ_CREDIT_BUREAU_MON     6049
AMT_REQ_CREDIT_BUREAU_WEEK    6049
AMT_REQ_CREDIT_BUREAU_DAY     6049
AMT_REQ_CREDIT_BUREAU_HOUR    6049
NAME_TYPE_SUITE                911
DEF_60_CNT_SOCIAL_CIRCLE        29
OBS_30_CNT_SOCIAL_CIRCLE        29
dtype: int64
obj_cols = app_train.select_dtypes('object').columns
obj_cols
Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE'], dtype='object')
# filling string cols with 'Not specified'
app_train[obj_cols] = app_train[obj_cols].fillna('Not specified')
app_test[obj_cols] = app_test[obj_cols].fillna('Not specified')
float_cols = app_train.select_dtypes('float').columns
float_cols
Index(['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR'], dtype='object')
# filling float values with median of train (not test)
app_train[float_cols] = app_train[float_cols].fillna(app_train[float_cols].median())
app_test[float_cols] = app_test[float_cols].fillna(app_train[float_cols].median())  # train medians, as stated above
app_train.shape, app_test.shape
((307511, 72), (48744, 71))
# Number of unique classes in each object column
app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)
NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                8
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
dtype: int64
app_train['DAYS_EMPLOYED'].describe()
count    307511.000000
mean      63815.045904
std      141275.766519
min      -17912.000000
25%       -2760.000000
50%       -1213.000000
75%        -289.000000
max      365243.000000
Name: DAYS_EMPLOYED, dtype: float64
sns.distplot(app_train['DAYS_EMPLOYED'], kde=False);
plt.show()
print('The non-anomalies default on %0.2f%% of loans' % (100 * app_train[app_train['DAYS_EMPLOYED'] != 365243]['TARGET'].mean()))
print('The anomalies default on %0.2f%% of loans' % (100 * app_train[app_train['DAYS_EMPLOYED'] == 365243]['TARGET'].mean()))
print('There are %d anomalous days of employment' % len(app_train[app_train['DAYS_EMPLOYED'] == 365243]))
The non-anomalies default on 8.66% of loans
The anomalies default on 5.40% of loans
There are 55374 anomalous days of employment
# Create an anomalous flag column
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
# Replace the anomalous values with nan
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
sns.distplot(app_train['DAYS_EMPLOYED'].dropna(), kde=False);
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace = True)
print('There are %d anomalies in the test data out of %d entries' % (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))
There are 9274 anomalies in the test data out of 48744 entries
# refilling float values with median of train (not test)
app_train[float_cols] = app_train[float_cols].apply(pd.to_numeric, errors='coerce')
app_train = app_train.fillna(app_train.median())
app_test[float_cols] = app_test[float_cols].apply(pd.to_numeric, errors='coerce')
app_test = app_test.fillna(app_train.median())
The correlation coefficient is not the most suitable way to measure feature importance, but it gives a first idea of the relationships.
correlations = app_train.corr()['TARGET'].sort_values()
print('Most Positive Correlations:\n', correlations.tail(10))
print('\n\nMost Negative Correlations:\n', correlations.head(10))
Most Positive Correlations:
REG_CITY_NOT_LIVE_CITY         0.044395
FLAG_EMP_PHONE                 0.045982
REG_CITY_NOT_WORK_CITY         0.050994
DAYS_ID_PUBLISH                0.051457
DAYS_LAST_PHONE_CHANGE         0.055218
REGION_RATING_CLIENT           0.058899
REGION_RATING_CLIENT_W_CITY    0.060893
DAYS_EMPLOYED                  0.063368
DAYS_BIRTH                     0.078239
TARGET                         1.000000
Name: TARGET, dtype: float64

Most Negative Correlations:
EXT_SOURCE_2                  -0.160295
EXT_SOURCE_3                  -0.155892
DAYS_EMPLOYED_ANOM            -0.045987
AMT_GOODS_PRICE               -0.039623
REGION_POPULATION_RELATIVE    -0.037227
AMT_CREDIT                    -0.030369
FLAG_DOCUMENT_6               -0.028602
HOUR_APPR_PROCESS_START       -0.024166
FLAG_PHONE                    -0.023806
AMT_REQ_CREDIT_BUREAU_MON     -0.014794
Name: TARGET, dtype: float64
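These linear correlations can miss non-monotonic effects. As a complementary check, mutual information between each numeric feature and TARGET can be estimated; a minimal sketch, subsampling to keep it fast (the sample size and random_state are illustrative choices, not part of the original pipeline):
# Complementary, non-linear relevance measure (sketch): mutual information
# between each numeric feature and TARGET, on a 20k-row subsample for speed.
from sklearn.feature_selection import mutual_info_classif
num_cols = app_train.select_dtypes('number').columns.drop('TARGET')
sample = app_train.sample(20000, random_state=0)
mi = mutual_info_classif(sample[num_cols].fillna(0), sample['TARGET'], random_state=0)
print(pd.Series(mi, index=num_cols).sort_values(ascending=False).head(10))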
# Compute the correlation matrix
corr = app_train.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(21, 19))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
<AxesSubplot:>
# Find the correlation of the positive days since birth and target
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])
-0.07823930830984513
plt.figure(figsize = (12, 6))
# KDE plot of loans that were repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label = 'target == 0')
# KDE plot of loans which were not repaid on time
sns.kdeplot(app_train.loc[app_train['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label = 'target == 1')
# Labeling of plot
plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');
# Age information into a separate dataframe
age_data = app_train[['TARGET', 'DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / 365
# Bin the age data
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
age_data.head(10)
TARGET | DAYS_BIRTH | YEARS_BIRTH | YEARS_BINNED | |
---|---|---|---|---|
0 | 1 | 9461 | 25.920548 | (25.0, 30.0] |
1 | 0 | 16765 | 45.931507 | (45.0, 50.0] |
2 | 0 | 19046 | 52.180822 | (50.0, 55.0] |
3 | 0 | 19005 | 52.068493 | (50.0, 55.0] |
4 | 0 | 19932 | 54.608219 | (50.0, 55.0] |
5 | 0 | 16941 | 46.413699 | (45.0, 50.0] |
6 | 0 | 13778 | 37.747945 | (35.0, 40.0] |
7 | 0 | 18850 | 51.643836 | (50.0, 55.0] |
8 | 0 | 20099 | 55.065753 | (55.0, 60.0] |
9 | 0 | 14469 | 39.641096 | (35.0, 40.0] |
# Group by the bin and calculate averages
age_groups = age_data.groupby('YEARS_BINNED').mean()
age_groups
TARGET | DAYS_BIRTH | YEARS_BIRTH | |
---|---|---|---|
YEARS_BINNED | |||
(20.0, 25.0] | 0.123036 | 8532.795625 | 23.377522 |
(25.0, 30.0] | 0.111436 | 10155.219250 | 27.822518 |
(30.0, 35.0] | 0.102814 | 11854.848377 | 32.479037 |
(35.0, 40.0] | 0.089414 | 13707.908253 | 37.555913 |
(40.0, 45.0] | 0.078491 | 15497.661233 | 42.459346 |
(45.0, 50.0] | 0.074171 | 17323.900441 | 47.462741 |
(50.0, 55.0] | 0.066968 | 19196.494791 | 52.593136 |
(55.0, 60.0] | 0.055314 | 20984.262742 | 57.491131 |
(60.0, 65.0] | 0.052737 | 22780.547460 | 62.412459 |
(65.0, 70.0] | 0.037270 | 24292.614340 | 66.555108 |
plt.figure(figsize = (8, 6))
# Graph the age bins and the average of the target as a bar plot
plt.bar(age_groups.index.astype(str), 100 * age_groups['TARGET'])
# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('Age Group (years)'); plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');
2-Data preparation
app_train = pd.get_dummies(data=app_train, columns=obj_cols)
app_test = pd.get_dummies(data=app_test, columns=obj_cols)
# back up of the target / need to keep this information
y = app_train.TARGET
app_train = app_train.drop(columns=['TARGET'])
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)
app_train.shape, app_test.shape
((307511, 168), (307511, 168))
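The inner align keeps only the dummy columns present in both frames, so train and test end up with identical column sets; a quick sanity check (sketch, not in the original notebook):
# Sanity check (sketch): after align(join='inner'), both frames expose exactly
# the same columns in the same order, so a model fit on app_train can score app_test.
assert list(app_train.columns) == list(app_test.columns)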
feat_to_scale = list(float_cols).copy()
feat_to_scale.extend(['CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'HOUR_APPR_PROCESS_START'])
feat_to_scale
['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_REGISTRATION', 'CNT_FAM_MEMBERS', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'HOUR_APPR_PROCESS_START']
scaler = StandardScaler()
app_train[feat_to_scale] = scaler.fit_transform(app_train[feat_to_scale])
app_test[feat_to_scale] = scaler.transform(app_test[feat_to_scale])  # reuse the scaler fitted on train
app_train.head()
SK_ID_CURR | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | ... | ORGANIZATION_TYPE_Trade: type 4 | ORGANIZATION_TYPE_Trade: type 5 | ORGANIZATION_TYPE_Trade: type 6 | ORGANIZATION_TYPE_Trade: type 7 | ORGANIZATION_TYPE_Transport: type 1 | ORGANIZATION_TYPE_Transport: type 2 | ORGANIZATION_TYPE_Transport: type 3 | ORGANIZATION_TYPE_Transport: type 4 | ORGANIZATION_TYPE_University | ORGANIZATION_TYPE_XNA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100002 | -0.577538 | 0.142129 | -0.478095 | -0.166143 | -0.507236 | -0.149452 | -1.506880 | 0.755835 | 0.379837 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 100003 | -0.577538 | 0.426792 | 1.725450 | 0.592683 | 1.600873 | -1.252750 | 0.166821 | 0.497899 | 1.078697 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 100004 | -0.577538 | -0.427196 | -1.152888 | -1.404669 | -1.092145 | -0.783451 | 0.689509 | 0.948701 | 0.206116 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 100006 | -0.577538 | -0.142533 | -0.711430 | 0.177874 | -0.653463 | -0.928991 | 0.680114 | -0.368597 | -1.375829 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 100007 | -0.577538 | -0.199466 | -0.213734 | -0.361749 | -0.068554 | 0.563570 | 0.892535 | -0.368129 | 0.191639 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 168 columns
3-Machine learning algorithms
X_train, X_test, y_train, y_test = train_test_split(app_train, y, test_size=0.2)
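With only ~8% positives, a stratified split keeps the class ratio identical in both folds; a variant of the split above (sketch, random_state chosen arbitrarily):
# Stratified variant (sketch): preserves the ~8% positive rate in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    app_train, y, test_size=0.2, stratify=y, random_state=42)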
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV
nbSplits=2
nbRepeats=2
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
cv = RepeatedKFold(n_splits=nbSplits, n_repeats=nbRepeats, random_state=1)
spaceDummyClf = [{'constant': [None], 'random_state': [None], 'strategy': ["stratified", "most_frequent", "prior", "uniform","constant"]}]
search= RandomizedSearchCV(estimator = dummy_clf, param_distributions = spaceDummyClf, n_iter = 30, cv = cv, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
search.fit(X_train, y_train)
print("roc:",roc_auc_score(y_test, search.predict(X_test)))
print("F1:",f1_score(y_test, search.predict(X_test),average='weighted'))
Fitting 4 folds for each of 5 candidates, totalling 20 fits
roc: 0.5
F1: 0.8780010382781587
plot_confusion_matrix(dummy_clf, X_test, y_test)
plt.show()
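Note that plot_confusion_matrix was removed in scikit-learn 1.2; on recent versions the equivalent call is ConfusionMatrixDisplay.from_estimator (sketch):
# Equivalent on scikit-learn >= 1.0 (plot_confusion_matrix was removed in 1.2)
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(dummy_clf, X_test, y_test)
plt.show()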
nbSplits=2
nbRepeats=2
model_randomforest = RandomForestClassifier()
model_randomforest.fit(X_train,y_train)
cv = RepeatedKFold(n_splits=nbSplits, n_repeats=nbRepeats, random_state=1)
spacerandomforest = [{'bootstrap': [True], 'ccp_alpha': [0.0], 'class_weight': ['balanced'], 'criterion': ['gini'], 'max_depth': [None], 'max_features': ['auto'], 'max_leaf_nodes': [None], 'max_samples': [None], 'min_impurity_decrease': [0.0], 'min_samples_leaf': [1], 'min_samples_split':[2], 'min_weight_fraction_leaf': [0.0], 'n_estimators': [10,50], 'n_jobs': [-1,2], 'oob_score': [False], 'random_state': [None],'warm_start': [False]}]
search= RandomizedSearchCV(estimator = model_randomforest, param_distributions = spacerandomforest, n_iter = 30, cv = cv, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
search.fit(X_train, y_train)
print("roc:",roc_auc_score(y_test, search.predict(X_test)))
print("F1:",f1_score(y_test, search.predict(X_test),average='weighted'))
Fitting 4 folds for each of 4 candidates, totalling 16 fits
roc: 0.5000808004690401
F1: 0.878024753363463
plot_confusion_matrix(model_randomforest, X_test, y_test)
plt.show()
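The ROC AUC above is computed on hard 0/1 predictions, which is why it barely moves from 0.5; scoring the predicted probabilities measures the ranking quality directly. A minimal sketch using the fitted search object:
# AUC from predicted probabilities (sketch): scores the full ranking instead
# of hard 0/1 labels, usually a more informative view of the classifier.
proba = search.predict_proba(X_test)[:, 1]
print("roc (proba):", roc_auc_score(y_test, proba))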
nbSplits=2
nbRepeats=2
model_lightgbm = lgb.LGBMClassifier()
model_lightgbm.fit(X_train,y_train)
cv = RepeatedKFold(n_splits=nbSplits, n_repeats=nbRepeats, random_state=1)
spacelightgbm = [{'boosting_type': ['gbdt'], 'class_weight': ['balanced'], 'colsample_bytree': [1.0], 'importance_type': ['split'], 'learning_rate': [0.1,0.2], 'max_depth': [-1], 'n_estimators': [50], 'n_jobs': [-1], 'num_leaves': [31], 'objective': [None], 'random_state': [50], 'reg_alpha': [0.0,0.2], 'reg_lambda': [0.0,0.2], 'silent': [True], 'subsample': [1.0], }]
search= RandomizedSearchCV(estimator = model_lightgbm, param_distributions = spacelightgbm, n_iter = 30, cv = cv, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
search.fit(X_train, y_train)
print("roc:",roc_auc_score(y_test, search.predict(X_test)))
print("F1:",f1_score(y_test, search.predict(X_test),average='weighted'))
Fitting 4 folds for each of 8 candidates, totalling 32 fits
roc: 0.6860200208050994
F1: 0.763578053725843
plot_confusion_matrix(model_lightgbm, X_test, y_test)
plt.show()
nbSplits=2
nbRepeats=2
model_xgBoost = xgb.XGBClassifier()
model_xgBoost.fit(X_train, y_train)
cv = RepeatedKFold(n_splits=nbSplits, n_repeats=nbRepeats, random_state=1)
spaceXgBoost = [{'objective':["binary:logistic"], 'random_state':[10,20], 'eval_metric':['auc'], 'max_delta_step':[2,4], 'scale_pos_weight':[10,20]}]
search= RandomizedSearchCV(estimator = model_xgBoost, param_distributions = spaceXgBoost, n_iter = 30, cv = cv, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
search.fit(X_train, y_train)
print("roc:",roc_auc_score(y_test, search.predict(X_test)))
print("F1:",f1_score(y_test, search.predict(X_test),average='weighted'))
[12:13:14] WARNING: ..\src\learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Fitting 4 folds for each of 8 candidates, totalling 32 fits
roc: 0.6758434428493557
F1: 0.8046231342459104
plot_confusion_matrix(model_xgBoost, X_test, y_test)
plt.show()
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
RandomForestClassifier()
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(app_train.shape[1]):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
Feature ranking: 1. feature 27 (0.064638) 2. feature 28 (0.059709) 3. feature 10 (0.046448) 4. feature 7 (0.046111) 5. feature 9 (0.045583) 6. feature 0 (0.044521) 7. feature 4 (0.042292) 8. feature 8 (0.041667) 9. feature 33 (0.040434) 10. feature 3 (0.039350) 11. feature 2 (0.035407) 12. feature 6 (0.035158) 13. feature 5 (0.034121) 14. feature 20 (0.031223) 15. feature 59 (0.022319) 16. feature 29 (0.017723) 17. feature 31 (0.017540) 18. feature 17 (0.013928) 19. feature 1 (0.009987) 20. feature 58 (0.008297) 21. feature 57 (0.007449) 22. feature 15 (0.006855) 23. feature 68 (0.006735) 24. feature 69 (0.006599) 25. feature 92 (0.006576) 26. feature 30 (0.006573) 27. feature 18 (0.006403) 28. feature 115 (0.006391) 29. feature 19 (0.006384) 30. feature 108 (0.006329) 31. feature 109 (0.006230) 32. feature 13 (0.006218) 33. feature 107 (0.006087) 34. feature 103 (0.006029) 35. feature 104 (0.005855) 36. feature 152 (0.005557) 37. feature 77 (0.005529) 38. feature 32 (0.005298) 39. feature 66 (0.005267) 40. feature 85 (0.005192) 41. feature 67 (0.005167) 42. feature 25 (0.005072) 43. feature 26 (0.004986) 44. feature 105 (0.004938) 45. feature 94 (0.004925) 46. feature 35 (0.004904) 47. feature 91 (0.004711) 48. feature 71 (0.004656) 49. feature 90 (0.004614) 50. feature 79 (0.004465) 51. feature 24 (0.004332) 52. feature 98 (0.004290) 53. feature 87 (0.004091) 54. feature 64 (0.003997) 55. feature 63 (0.003789) 56. feature 93 (0.003625) 57. feature 16 (0.003462) 58. feature 106 (0.003423) 59. feature 143 (0.003306) 60. feature 102 (0.003058) 61. feature 40 (0.002678) 62. feature 114 (0.002667) 63. feature 117 (0.002584) 64. feature 99 (0.002551) 65. feature 76 (0.002528) 66. feature 56 (0.002501) 67. feature 22 (0.002487) 68. feature 161 (0.002461) 69. feature 121 (0.002383) 70. feature 23 (0.002191) 71. feature 82 (0.002159) 72. feature 140 (0.002140) 73. feature 96 (0.002066) 74. feature 165 (0.002030) 75. feature 88 (0.002004) 76. feature 101 (0.001941) 77. feature 113 (0.001895) 78. feature 61 (0.001878) 79. feature 62 (0.001842) 80. feature 149 (0.001798) 81. feature 38 (0.001774) 82. feature 89 (0.001683) 83. feature 130 (0.001682) 84. feature 138 (0.001600) 85. feature 157 (0.001538) 86. feature 150 (0.001428) 87. feature 37 (0.001400) 88. feature 111 (0.001339) 89. feature 164 (0.001303) 90. feature 70 (0.001274) 91. feature 21 (0.001249) 92. feature 126 (0.001238) 93. feature 123 (0.001230) 94. feature 148 (0.001187) 95. feature 136 (0.001077) 96. feature 163 (0.001045) 97. feature 81 (0.001025) 98. feature 75 (0.001023) 99. feature 60 (0.000999) 100. feature 167 (0.000991) 101. feature 145 (0.000983) 102. feature 55 (0.000968) 103. feature 100 (0.000959) 104. feature 12 (0.000958) 105. feature 54 (0.000826) 106. feature 124 (0.000807) 107. feature 134 (0.000801) 108. feature 153 (0.000745) 109. feature 48 (0.000734) 110. feature 141 (0.000726) 111. feature 131 (0.000721) 112. feature 144 (0.000718) 113. feature 112 (0.000704) 114. feature 156 (0.000692) 115. feature 50 (0.000624) 116. feature 97 (0.000595) 117. feature 74 (0.000588) 118. feature 122 (0.000532) 119. feature 166 (0.000527) 120. feature 41 (0.000488) 121. feature 151 (0.000476) 122. feature 119 (0.000449) 123. feature 154 (0.000440) 124. feature 73 (0.000436) 125. feature 146 (0.000404) 126. feature 155 (0.000392) 127. feature 110 (0.000354) 128. feature 132 (0.000343) 129. feature 43 (0.000323) 130. feature 72 (0.000314) 131. feature 120 (0.000306) 132. feature 14 (0.000304) 133. feature 142 (0.000304) 134. 
feature 129 (0.000303) 135. feature 137 (0.000300) 136. feature 116 (0.000291) 137. feature 118 (0.000282) 138. feature 160 (0.000268) 139. feature 139 (0.000261) 140. feature 45 (0.000247) 141. feature 46 (0.000190) 142. feature 51 (0.000164) 143. feature 127 (0.000157) 144. feature 52 (0.000133) 145. feature 47 (0.000128) 146. feature 162 (0.000123) 147. feature 53 (0.000119) 148. feature 84 (0.000118) 149. feature 133 (0.000096) 150. feature 147 (0.000091) 151. feature 125 (0.000080) 152. feature 34 (0.000073) 153. feature 128 (0.000067) 154. feature 135 (0.000052) 155. feature 86 (0.000049) 156. feature 159 (0.000048) 157. feature 39 (0.000044) 158. feature 49 (0.000031) 159. feature 158 (0.000030) 160. feature 80 (0.000018) 161. feature 83 (0.000002) 162. feature 42 (0.000001) 163. feature 36 (0.000001) 164. feature 44 (0.000001) 165. feature 65 (0.000000) 166. feature 78 (0.000000) 167. feature 11 (0.000000) 168. feature 95 (0.000000)
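The ranking above only prints positional indices; mapping them back to column names makes it readable (sketch, top 15 only):
# Map positional feature indices back to column names (sketch)
feature_names = app_train.columns
for rank, idx in enumerate(indices[:15], start=1):
    print("%d. %s (%f)" % (rank, feature_names[idx], importances[idx]))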
# Plot the feature importances of the rf
plt.figure(figsize=(16, 8))
plt.title("Feature importances")
plt.bar(range(app_train.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(app_train.shape[1]), indices)
plt.xlim([-1, app_train.shape[1]])
plt.show()
(pd.Series(rf.feature_importances_, index=app_train.columns)
.nlargest(15)
.plot(kind='barh'))
<AxesSubplot:>
xgb.plot_importance(model_xgBoost,height=100,max_num_features=12)
<AxesSubplot:title={'center':'Feature importance'}, xlabel='F score', ylabel='Features'>
# refit the XGBoost model on the full training set
model_xgBoost.fit(app_train, y)
# Make predictions & make sure to select the second column only
result = model_xgBoost.predict_proba(app_test)[:, 1]
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = result
submit.head(5)
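To produce a Kaggle-style submission, the frame can then be written to CSV; a minimal sketch (the file name is an assumption):
# Write the submission file (sketch); 'submission.csv' is an illustrative path
submit.to_csv('submission.csv', index=False)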